#!/usr/bin/env perl

use warnings;
use strict;
use File::Basename qw/basename dirname/;
use Array::Transpose;
use Carp;

my $usage="
convertHiCMatrix <subcommand> <...>
subcommand:	
compile		#compile a set of matrixes into one table
convert		#just convert BIN name, copy modified content to a new file. Do not actually combine matrixes
query		#retrieve interaction information for certain regions
querysingle	#same as 'query', but only for intrachromosomal interaction
index		#generate index for matrix (at chromosomal resolution)
";
@ARGV>=1 or die $usage,"\n";

my %chr2no=(
    'chr1' => '1', 'chr2' => '2', 'chr3' => '3', 'chr4' => '4',
    'chr5' => '5', 'chr6' => '6', 'chr7' => '7', 'chr8' => '8',
    'chr9' => '9', 'chr10' => '10', 'chr11' => '11', 'chr12' => '12',
    'chr13' => '13', 'chr14' => '14', 'chr15' => '15', 'chr16' => '16',
    'chr17' => '17', 'chr18' => '18', 'chr19' => '19', 'chr20' => '20',
    'chr21' => '21', 'chr22' => '22', 'chrx' => '23', 'chry' => '24',
    'chrmito' => '25', 'chrxy' => '26',
    '1' => '1', '2' => '2', '3' => '3', '4' => '4',
    '5' => '5', '6' => '6', '7' => '7', '8' => '8',
    '9' => '9', '10' => '10', '11' => '11', '12' => '12',
    '13' => '13', '14' => '14', '15' => '15', '16' => '16',
    '17' => '17', '18' => '18', '19' => '19', '20' => '20',
    '21' => '21', '22' => '22', 'x' => '23', 'y' => '24',
    'mito' => '25', 'xy' => '26',
);

my $subcommand=shift @ARGV;
if($subcommand eq "compile")
{
    &compile(@ARGV);
} elsif ($subcommand eq "query")
{
    &query(@ARGV);
} elsif ($subcommand eq "querysingle")
{
    &querysingle(@ARGV);
} elsif ($subcommand eq "convert")
{
    &convert(@ARGV);
} elsif ($subcommand eq "index")
{
    &index(@ARGV);
}else
{
    die $usage,"\n";
}

############################################################
#######################END OF MAIN##########################
############################################################


#read a bunch of matrixes, put them into one table
sub compile
{
    my $usage="compile <build> <binsize> <out-file-prefix> <1.txt 2.txt ...>\n";
    die "$usage\n" if @ARGV<=2;
    #example
    #./convertHiCMatrix compile hg18 1000000 try_hic_1000000 interaction_matrix/HIC_k562*_1000000_exp.txt

    my $build=shift @ARGV;
    my $binsize=shift @ARGV;
    my $outprefix=shift @ARGV;
    my $outfile="$outprefix.$build.txt";
    my $fh;
    my %in_fh; #input file filehandle hash, save overhead on opening
    my $lines=0;
    open $fh,'>',$outfile or die "ERROR: failed to open $outfile ($!)\n";
    for my $i(@ARGV)
    {
	open (my $temp_fh,'<',$i) or die "ERROR: failed to open $i ($!)\n";
	$in_fh{$i}=$temp_fh;
    }

    die "ERROR: genome build must be hg18 or hg19\n" unless $build eq 'hg18' or $build eq 'hg19';
    warn "NOTICE: binsize is $binsize\n";
    warn "NOTICE: begin consolidating individual matrixes\n";
    #prepere header
    my %all_header; #store all numerical chromosome names in keys, values are [chr,pos1,pos2]
    &save_header(\%all_header,\%in_fh,\@ARGV);
    #print header
    &output_header(\%all_header,$fh);
    #prepare content
    for my $i(sort {$a<=>$b} keys %all_header)
    {
	&output_matrix($i,\%all_header,$fh,\%in_fh,$binsize,\$lines);
    }
    warn "NOTICE: content done\n";
    warn "NOTICE: $lines lines written to $outfile\n";
    close $fh;
    close $_ for (values %in_fh);
}

#read header from files and save it to %all_header
sub save_header
{
    my $all_header_ref=shift;
    my $in_fh_ref=shift;

    for my $i(keys %$in_fh_ref) #loop over all input files
    {
	my $colname_ref=&read_raw_matrix_header(${$in_fh_ref}{$i});	
	my $chr=${$colname_ref}[0][0]; #entry in colname: [chr,pos1,pos2]
	if(defined ${$all_header_ref}{$chr})
	{
	    if(@{${$all_header_ref}{$chr}} != @{$colname_ref})
	    {
		&dbg_print_header($all_header_ref);
		die "ERROR: number of BIN names found in $i differs from previous records (",
		scalar(@{${$all_header_ref}{$chr}})," vs ",scalar(@$colname_ref),")\n";
	    }
	}
	else
	{
	    ${$all_header_ref}{$chr}=$colname_ref;
	}
    }
}
sub dbg_print_header
{
    my $header_ref=shift;
    for my $i(keys %$header_ref)
    {
	warn "chr:$i\n";
	for my $j( @{${$header_ref}{$i}})
	{
	    my ($chr,$pos1,$pos2)=@$j;
	    warn "chr:$chr,pos1:$pos1,pos2:$pos2\n";
	}
    }
}
#print header
sub output_header
{
    my $header_ref=shift @_;
    my $fh=shift @_;
    my @f;

    #keys are numerical chr
    for my $i(sort {$a<=>$b} keys %$header_ref)
    {
	#entry is [chr,pos1,pos2]
	#suppose entries are sorted
	for my $j( @{${$header_ref}{$i}} )
	{ 
	    #warn "outputing chr:$i\n";
	    #&dbg_print_header($header_ref);
	    my ($chr,$pos1,$pos2)=@$j;
	    push @f,"$chr:$pos1-$pos2";
	}
    }

    print $fh join ("\t"," ",@f),"\n";
    warn "NOTICE: header printed\n";
}
#loop over all files again and again
#until we find all interaction data 
#for the specified chromosome
sub output_matrix
{
    my $chr=shift @_; #chr of interest
    #header, it's used to show what chromosome we
    #need to look at for the chr of interest
    my $header_ref=shift @_; 
    my $fh=shift @_;
    my $in_fh_ref=shift @_;
    my $binsize=shift;
    my $line_ref=shift;

    #huge matrix storing interaction data for all pairwise interaction for chr of interest
    #luckily the data itself should be around a few hundred MB
    my %out; #use pos1 in each bin as key, store interaction with every other chromosome as value

    #loop over all chromosomes
    for my $i(sort {$a<=>$b} keys %$header_ref)
    {
	#filename: HIC_gm06690_chr10_chr10_1000000_exp.txt
	for my $j(keys %$in_fh_ref)
	{
	    my $base=basename $j;
	    my ($prefix)= $base=~m/^(.*?)\.txt$/ or die "ERROR: .txt file expected ($j)\n";
	    my ($assay,$cell,$chr1,$chr2,$local_binsize,$type)=split /_/,$prefix;
	    die "ERROR: failed to parse filename ($prefix)\n" unless defined $chr1 && defined $chr2 && defined $local_binsize;
	    die "ERROR: incorrect bin size ($local_binsize vs $binsize) in $j\n" unless $local_binsize == $binsize;
	    $chr1=lc $chr1;
	    $chr2=lc $chr2;
	    next unless $chr2no{$chr1} eq $chr && $chr2no{$chr2} eq $i
		or $chr2no{$chr1} eq $i   && $chr2no{$chr2} eq $chr;

	    &read_raw_block($chr,${$in_fh_ref}{$j},\%out);
	    last; #only need to output chr1,chr2 interaction once
	}
    }
    &output_onechr_interaction($chr,$header_ref,\%out,$fh,$line_ref);
}
#similar to 'query', but only for intrachromosomal interactions
sub querysingle
{
    my $usage="querysingle <intrachromosomal file> <output file> <region>\n";
    die "$usage\n" if @ARGV!=3;
    warn "WARNING: ***************************\n";
    warn "WARNING: FIRST line must be HEADER!!\n";
    warn "WARNING: ***************************\n";
    my $file=shift;
    my $out=shift;
    my $region=shift;
    &query($file,$out,$region);
}
#retrieve values from certain regions
#if only one region is specified, return interaction values for all genome with that region
#if two regions are specified, only return interaction values between the two regions
sub query
{
    #query string example
    #1:1000-2000 2:9999-1000000
    #return string example
    ####################################################
    #SPACE	2:1-9999	2:10000-19999	...    #
    #1:1000-1999	8.1	1.4	...	       #
    #1:2000-2999	1.2	19.1	...	       #
    ####################################################
    my $usage="query <file> <output file> <region1> <region2>\n".
    "<region1> chr:pos-pos, chr is numerical\n".
    "<region2> chr:pos-pos, chr is numerical\n".
    "If only one region is specified, output interaction between entire genome and this region\n".
    "Otherwise, output interaction that is involved between the two regions\n".
    "20140816change: if two regions are specified, output the interaction between first region\n".
    " and the chromosome defined by the second region, i.e. position will be ignored!!!\n";
    die "$usage\n" unless @ARGV==4 or @ARGV==3;
    my $file=shift;
    my $idx="$file.idx";
    my $output=shift;
    my $region1=shift;
    my $region2=shift;
    my $out; #output filehandle
    my $lines=0;
    warn "WARNING: position in region2 will be ignored!!!\n";

    die "ERROR: no index (expect $idx)\n" unless -e $idx;
    open (my $in ,"<",$file) or die "$file: $!\n";
    if($output eq 'STDOUT')
    {
	$out=\*STDOUT;
    } else
    {
	open ($out,">",$output) or die "$output: $!\n";
    }
    $lines=&gen_query_output({in=>$in,out=>$out,idx=>$idx,region1=>$region1,region2=>$region2});
    close $in;
    close $out;
    warn "NOTICE: $lines lines written to $output\n";
}
#output interaction matrix specified by one region or two regions
sub gen_query_output
{
    my $config=shift;
    my $idx_ref=&read_idx($config->{idx});
    my $lines=0;
    my @col_needed=&find_col_needed($config->{in},$config->{region2}); #starts with 0!!!
#when region2 is specified, col_needed is limited by it
#otherwise, we output all interactions for region1
    my @header=&get_header($config->{in});
    #0 field (empty) will not be output, for R read.table()
    print {$config->{out}} join ("\t",@header[grep {$_!=0} @col_needed]),"\n";
    $lines++;

    my ($chr,$pos1,$pos2)= $config->{region1}=~/(.*?):(\d+)-(\d+)/;
    die "ERROR: failed to parse region1 ($config->{region1})\n" unless defined $chr && defined $pos1 && defined $pos2;
    die "ERROR: pos2 must be larger than pos1\n" if $pos1>$pos2;
    seek $config->{in},$idx_ref->{$chr},0 or die "seek failed: $!\n";
    while(readline $config->{in}) #since filehandle is stored as hash value, readline must be used instead of <>
    {
	s/[\r\n]+$//;
	my @f=split /\t/;
	die "ERROR: ",scalar(@header)," fields expected at line $_\n" unless @f==@header;
	die "ERROR: failed to parse row name at line $_\n" unless $f[0]=~/^(.*?):(\d+)-(\d+)/;
	my ($local_chr,$local_pos1,$local_pos2)=($1,$2,$3);
	#if($local_chr eq $chr && $local_pos2 >= $pos1)
	#{
	#    if($local_pos1 <= $pos2)
	#    {
	if($local_chr eq $chr && $local_pos1 <= $pos2 && $local_pos2 >= $pos1)
	{
	    #note: when filehandle is a hash value, you have to deference it before printing to it
	    print {$config->{out}} join ("\t",@f[@col_needed]),"\n";
	    $lines++;
	} elsif ($local_chr > $chr || $local_pos1 > $pos2)
	{#suppose records are sorted based on position, same chromosome records are grouped
	    last;
	}
	#}
    }
    return $lines;
}
#fetch header (1st line)
sub get_header
{
    my $in=shift;
    seek $in,0,0 or die "seek failed: $!";
    my $header=<$in>;
    $header=~s/[\r\n]+$//;
    return split /\t/,$header;
}
#based on header in in and region, output colnum indexes matching region
sub find_col_needed
{
    my $in=shift;
    my $region=shift;
    my @return;

    my @f=&get_header($in);
    if(defined $region)
    {
	push @return,0; #0 is empty
	my ($chr,$pos1,$pos2)= $region=~/^(.*?):(\d+?)-(\d+)/;
	die "ERROR: failed to parse region ($region)\n" if( (!defined $chr) or (!defined $pos1) or (!defined $pos2));
        die "ERROR: pos2 must be larger than pos1\n" if $pos1>$pos2;
	for my $i(1..$#f)
	{
	    die "ERROR: failed to parse column name\n" unless $f[$i]=~/^(.*?):(\d+)-(\d+)/;
	    my ($local_chr,$local_pos1,$local_pos2)=($1,$2,$3);
	    if($local_chr) #pos1 and pos2 are ignored HERE!!!
	    {
		push @return,$i;
	    } elsif ($local_chr > $chr)
	    {#suppose records are sorted based on position, same chromosome records are grouped
		last;
	    }
	}
    }
    else
    {
	@return=(0..$#f);
    }
    return @return;
}
#record position of first apperance at row names for each chromosome
sub index
{
    my $usage="index <file1 file2 ...>\n";
    die "$usage\n" unless @ARGV>=1;
    while(my $file=shift)
    {
	my $idx="$file.idx";
	open IN,"<",$file or die "$file: $!\n";
	open OUT,">",$idx or die "$idx $!\n";
	warn "WARNING: assuming 1st line is column name, the rest is interaction matrix\n";
	my $header=<IN>;
	my $prev_chr;
	my $prev_pos=tell;

	while(<IN>)
	{
	    die "ERROR: failed to parse row name at line $. of $file\n" unless /^(.*?):\d+-\d+/;
	    my $chr=$1;
	    if(!defined $prev_chr or $chr ne $prev_chr)
	    {
		print OUT "$chr\t$prev_pos\n";
		$prev_chr=$chr;
	    }
	    $prev_pos=tell;
	}
	close IN;
	close OUT;
	warn "NOTICE: $idx done\n";
    }
}
#read index
sub read_idx
{
    my $idx=shift;
    my %return;
    open IN,"<",$idx or die "$idx: $!\n";
    while(<IN>)
    {
	s/[\r\n]+$//;
	my @f=split;
	die "ERROR: 2 fields epxected at line $. of $idx\n" unless @f==2;
	die "ERROR: duplicate key at line $. of $idx\n" if defined $return{$f[0]};
	$return{$f[0]}=$f[1];
    }
    close IN;
    return \%return;
}
#output all interaction for the specified chromosome
sub output_onechr_interaction
{
    my $chr=shift;
    my $header_ref=shift;
    my $out_ref=shift;
    my $fh=shift;
    my $line_ref=shift;
    my $prev_pos;

    for my $i( @{${$header_ref}{$chr}} )
    {
	my ($chr,$pos1,$pos2)=@$i;
	#bin name comes from colname, make sure it's sorted incrementally
	#row name doesn't matter, it's stored in hash (%out)
	if(defined $prev_pos)
	{
	    die "ERROR: header not sorted for $chr\n" unless $pos1>$prev_pos;
	    $prev_pos=$pos1;
	} else
	{
	    $prev_pos=$pos1;
	}
	print $fh join("\t","$chr:$pos1-$pos2",@{${$out_ref}{$pos1}}),"\n";
	$$line_ref++;
    }

}
#read one file(block) into the hash for output
sub read_raw_block
{
    my $chr_interest=shift;
    my $fh=shift;
    my $out_ref=shift;
    my @block_array;
    my $read_by_row=1;
    my $field_count; #make sure every line has same number of fields

    seek $fh,0,0;
    $.=0;
    while(<$fh>)
    {
	next if /^#|^\s+$/;
	s/[\r\n]+$//;
	my @f=split;
	unshift @f," " if $.==2;
	push @block_array,[@f];
	if (defined $field_count)
	{
	    die "ERROR: unequal number of fields at line $." unless $field_count==@f;
	}
	else
	{
	    $field_count=@f;
	}
    }
    for my $i(@block_array[1..$#block_array])
    {
	#the order of bin has been checked earlier
	my ($chr,$pos1,$pos2)=@{&extract_chr_pos(${$i}[0])};
	die "ERROR: unable to recognize row name (${$i}[0])\n" unless defined $chr && defined $pos1 && defined $pos2;
	if ($chr ne $chr_interest)
	{
		$read_by_row=0;
	}
	last;
    }
    if(!$read_by_row)
    {
	#transpose @block_array
	@block_array=transpose(\@block_array);
    }
    for my $i(@block_array[1..$#block_array])
    {
	#the order of bin has been checked earlier
	my ($chr,$pos1,$pos2)=@{&extract_chr_pos(${$i}[0])};
	die "ERROR: unable to recognize row name (${$i}[0])\n" unless defined $chr && defined $pos1 && defined $pos2;
	if (defined ${$out_ref}{$pos1})
	{
	    push @{${$out_ref}{$pos1}},@{$i}[1..$#{$i}];
	} else
	{
	    ${$out_ref}{$pos1}=[@{$i}[1..$#{$i}]];
	}
    }
}

#read raw interaction matrix
sub read_raw_matrix_header
{
    my $fh=shift @_;
    my @colname;
    seek $fh,0,0;
    while(<$fh>)
    {
	next if /^#/;
	s/^\s+|[\r\n]+$//;
	chomp;
	if($.==2) #header line
	{
	    @colname=split;
	    last;
	    #because all pairwise interactions are included
	    #column names will be sufficient to get to know
	    #all BIN names
	}
    }
    for my $i(@colname)
    {#colname stores ref to each BIN name
	$i=&extract_chr_pos($i);
    }
    return \@colname;
}

sub extract_chr_pos
{
    my $s=shift @_;
    my @f=split /[\|\.]/,$s;
    my ($chr,$pos1,$pos2);

    die ("ERROR: unable to parse header ($s)\n") unless @f>1;
    ($chr,$pos1,$pos2)=split /[:\-]/,$f[$#f];
    die "ERROR: unable to parse chromosome and position ($s)\n" unless 
    defined $chr && defined $pos1 && defined $pos2;
    $chr=lc $chr;
    die "ERROR: unable to recognize chromosome name ($chr)\n" unless defined $chr2no{$chr};
    return [$chr2no{$chr},$pos1,$pos2];
}
#convert BIN names to chr:pos-pos format without changing the content
sub convert
{
    my $usage="convert <file1 file2 ...>";
    die "$usage\n" unless @ARGV>=1;
    for my $i(@_)
    {
	my $out="$i.BINname_converted";
	my @header;
	my @colname;
	open (my $fh,"<",$i) or die "$i:$!\n";
	open (my $outfh,">",$out) or die "$out:$!\n";
	while(<$fh>)
	{
	    next if /^#|^\s*$/;
	    s/[\r\n]+$//;
	    my @f=split; #leading whitespaces are removed
	    die "ERROR: ",scalar(@header)," fields expected at line $. of $i\n" if @header && @f!=@header+1;
	    if(!@header)
	    {
		@header=@f; #1st field of header is empty
		for my $j(@f)
		{
		    my ($chr,$pos1,$pos2)=@{&extract_chr_pos($j)};
		    $j="$chr:$pos1-$pos2";
		}
		print $outfh join("\t"," ",@f),"\n";
	    }
	    else
	    {
		push @colname,$f[0];
		my ($chr,$pos1,$pos2)=@{&extract_chr_pos($f[0])};
		$f[0]="$chr:$pos1-$pos2";
		print $outfh join("\t",@f),"\n";
	    }
	}
	close $fh;
	close $outfh;
	for my $j(0..$#header)
	{
	    if(!defined $colname[$j] or $header[$j] ne $colname[$j])
	    {
		unlink $out;
		die "ERROR: row name and column name inconsistent ($header[$j] vs $colname[$j]) in $i\n";
	    }
	}
	warn "NOTICE: $out done\n";
    }
}

#example output

=head

#hg18
	1:1-999999	1:1000000-1999999	...
1:1-999999	0	0	...

=cut

#example input

=head

#GMBothall.0.maq.ctg1.ctg22.1000000bp.hm.newtracks12forBryan.heatmap.matrix.tab
	HIC_bin1|hg18|chr22:1-999999	HIC_bin2|hg18|chr22:1000000-1999999	HIC_bin3|hg18|chr22:2000000-2999999	HIC_bin4|hg18|chr22:3000000-3999999	HIC_bin5|hg18|chr22:4000000-4999999	HIC_bin6|hg18|chr22:5000000-5999999	HIC_bin7|hg18|chr22:6000000-6999999	HIC_bin8|hg18|chr22:7000000-7999999	HIC_bin9|hg18|chr22:8000000-8999999	HIC_bin10|hg18|chr22:9000000-9999999	HIC_bin11|hg18|chr22:10000000-10999999	HIC_bin12|hg18|chr22:11000000-11999999	HIC_bin13|hg18|chr22:12000000-12999999	HIC_bin14|hg18|chr22:13000000-13999999	HIC_bin15|hg18|chr22:14000000-14999999	HIC_bin16|hg18|chr22:15000000-15999999	HIC_bin17|hg18|chr22:16000000-16999999	HIC_bin18|hg18|chr22:17000000-17999999	HIC_bin19|hg18|chr22:18000000-18999999	HIC_bin20|hg18|chr22:19000000-19999999	HIC_bin21|hg18|chr22:20000000-20999999	HIC_bin22|hg18|chr22:21000000-21999999	HIC_bin23|hg18|chr22:22000000-22999999	HIC_bin24|hg18|chr22:23000000-23999999	HIC_bin25|hg18|chr22:24000000-24999999	HIC_bin26|hg18|chr22:25000000-25999999	HIC_bin27|hg18|chr22:26000000-26999999	HIC_bin28|hg18|chr22:27000000-27999999	HIC_bin29|hg18|chr22:28000000-28999999	HIC_bin30|hg18|chr22:29000000-29999999	HIC_bin31|hg18|chr22:30000000-30999999	HIC_bin32|hg18|chr22:31000000-31999999	HIC_bin33|hg18|chr22:32000000-32999999	HIC_bin34|hg18|chr22:33000000-33999999	HIC_bin35|hg18|chr22:34000000-34999999	HIC_bin36|hg18|chr22:35000000-35999999	HIC_bin37|hg18|chr22:36000000-36999999	HIC_bin38|hg18|chr22:37000000-37999999	HIC_bin39|hg18|chr22:38000000-38999999	HIC_bin40|hg18|chr22:39000000-39999999	HIC_bin41|hg18|chr22:40000000-40999999	HIC_bin42|hg18|chr22:41000000-41999999	HIC_bin43|hg18|chr22:42000000-42999999	HIC_bin44|hg18|chr22:43000000-43999999	HIC_bin45|hg18|chr22:44000000-44999999	HIC_bin46|hg18|chr22:45000000-45999999	HIC_bin47|hg18|chr22:46000000-46999999	HIC_bin48|hg18|chr22:47000000-47999999	HIC_bin49|hg18|chr22:48000000-48999999	HIC_bin50|hg18|chr22:49000000-49691431	
HIC_bin1|hg18|chr1:1-999999	0	0	0	0	0	0	0	0	0	0	0	0	0	0	4.0	9.0	14.0	9.0	10.0	17.0	9.0	14.0	7.0	6.0	11.0	14.0	9.0	7.0	10.0	16.0	11.0	8.0	11.0	13.0	15.0	13.0	11.0	15.0	10.0	9.0	9.0	12.0	13.0	11.0	14.0	10.0	12.0	15.0	24.0	33.0

=cut
